# Load COVID state-level data from NYT
cv_states <- read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")
# Load state population data
state_pops <- read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv")
# Process population data
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
# Merge the two data frames by the appropriate columns (likely 'state' and 'abb')
cv_states <- merge(cv_states, state_pops, by = "state")Lab 11
Step 1
Step 2
dim(cv_states)[1] 58094 9
head(cv_states) state date fips cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04 1 1587224 21263 1 4887871 96.50939 AL
2 Alabama 2020-04-25 1 6213 213 1 4887871 96.50939 AL
3 Alabama 2023-02-26 1 1638348 21400 1 4887871 96.50939 AL
4 Alabama 2022-12-03 1 1549285 21129 1 4887871 96.50939 AL
5 Alabama 2020-05-06 1 8691 343 1 4887871 96.50939 AL
6 Alabama 2021-04-21 1 524367 10807 1 4887871 96.50939 AL
tail(cv_states) state date fips cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11 56 175290 1884 56 577737 5.950611 WY
58090 Wyoming 2022-08-21 56 173487 1871 56 577737 5.950611 WY
58091 Wyoming 2021-01-26 56 51152 596 56 577737 5.950611 WY
58092 Wyoming 2021-02-21 56 53795 662 56 577737 5.950611 WY
58093 Wyoming 2021-08-22 56 70671 809 56 577737 5.950611 WY
58094 Wyoming 2021-03-20 56 55581 693 56 577737 5.950611 WY
str(cv_states)'data.frame': 58094 obs. of 9 variables:
$ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ date : chr "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
$ deaths : int 21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : chr "AL" "AL" "AL" "AL" ...
Step 3
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
### FINISH THE CODE HERE
# order the data first by state, second by date
cv_states <- cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)'data.frame': 58094 obs. of 9 variables:
$ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2020-03-13" "2020-03-14" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 6 12 23 29 39 51 78 106 131 157 ...
$ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states) state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states) state date fips cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18 56 185640 2009 56 577737 5.950611 WY
57916 Wyoming 2023-03-19 56 185640 2009 56 577737 5.950611 WY
57647 Wyoming 2023-03-20 56 185640 2009 56 577737 5.950611 WY
57867 Wyoming 2023-03-21 56 185800 2014 56 577737 5.950611 WY
58057 Wyoming 2023-03-22 56 185800 2014 56 577737 5.950611 WY
57812 Wyoming 2023-03-23 56 185800 2014 56 577737 5.950611 WY
# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states) state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states) state date fips cases
Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
Illinois : 1155 1st Qu.:2020-12-06 1st Qu.:16.00 1st Qu.: 112125
California : 1154 Median :2021-09-11 Median :29.00 Median : 418120
Arizona : 1153 Mean :2021-09-10 Mean :29.78 Mean : 947941
Massachusetts: 1147 3rd Qu.:2022-06-17 3rd Qu.:44.00 3rd Qu.: 1134318
Wisconsin : 1143 Max. :2023-03-23 Max. :72.00 Max. :12169158
(Other) :51184
deaths geo_id population pop_density
Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
1st Qu.: 1598 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
Median : 5901 Median :29.00 Median : 4468402 Median : 107.860
Mean : 12553 Mean :29.78 Mean : 6397965 Mean : 423.031
3rd Qu.: 15952 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
NA's :1106
abb
WA : 1158
IL : 1155
CA : 1154
AZ : 1153
MA : 1147
WI : 1143
(Other):51184
min(cv_states$date)[1] "2020-01-21"
max(cv_states$date)[1] "2023-03-23"
Conclusion: The date range min (2020-01-21) to max (2023-03-23)
Step 4
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
### FINISH THE CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] <- cv_subset$cases[j] - cv_subset$cases[j - 1]
cv_subset$new_deaths[j] <- cv_subset$deaths[j] - cv_subset$deaths[j - 1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>%
filter(date >= as.Date("2021-06-01"))
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
library(ggplot2)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + labs(title = "New Cases by Date for Each State") + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + labs(title = "New Deaths by Date for Each State") + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] <- cv_subset$new_cases[j] + cv_subset$cases[j - 1]
cv_subset$deaths[j] <- cv_subset$new_deaths[j] + cv_subset$deaths[j - 1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)#p2=NULLStep 5
# Add population normalized (by 100,000) counts for each variable
cv_states$per100k <- as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k <- as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))Warning: NAs introduced by coercion
cv_states$deathsper100k <- as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newdeathsper100k <- as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1))Warning: NAs introduced by coercion
# Add a naive CFR variable = deaths / cases
cv_states <- cv_states %>% mutate(naive_CFR = round((deaths * 100 / cases), 2))
# Create a `cv_states_today` variable
cv_states_today <- subset(cv_states, date == max(cv_states$date))#Step 6
### FINISH CODE HERE
library(plotly)
cv_states <- cv_states %>%
mutate(
per100k = as.numeric(ifelse(population > 0 & !is.na(population),
round(cases / (population / 100000), 1), NA)),
deathsper100k = as.numeric(ifelse(population > 0 & !is.na(population),
round(deaths / (population / 100000), 1), NA))
)
cv_states <- cv_states %>%
mutate(naive_CFR = as.numeric(ifelse(cases > 0 & !is.na(cases),
round((deaths * 100 / cases), 2), NA)))
cv_states_today <- cv_states %>%
filter(date == max(date))
print("Columns in cv_states_today:")[1] "Columns in cv_states_today:"
print(names(cv_states_today)) [1] "state" "date" "fips" "cases"
[5] "deaths" "geo_id" "population" "pop_density"
[9] "abb" "new_cases" "new_deaths" "per100k"
[13] "newper100k" "deathsper100k" "newdeathsper100k" "naive_CFR"
# pop_density vs. cases
cv_states_today %>%
plot_ly(x = ~pop_density, y = ~cases,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70),
marker = list(sizemode = 'diameter'), opacity = 0.5)Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")
# pop_density vs. cases after filtering
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~cases,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70),
marker = list(sizemode = 'diameter'), opacity = 0.5)Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
names(cv_states_today_filter) [1] "state" "date" "fips" "cases"
[5] "deaths" "geo_id" "population" "pop_density"
[9] "abb" "new_cases" "new_deaths" "per100k"
[13] "newper100k" "deathsper100k" "newdeathsper100k" "naive_CFR"
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70),
marker = list(sizemode = 'diameter'), opacity = 0.5)Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Adding hoverinfo
cv_states_today_filter %>%
plot_ly(x = ~pop_density, y = ~deathsper100k,
type = "scatter", mode = 'markers', color = ~state,
size = ~population, sizes = c(5, 70),
marker = list(sizemode = 'diameter'), opacity = 0.5,
hoverinfo = 'text',
text = ~paste(
paste(state, ":", sep = ""),
paste(" Cases per 100k: ", per100k, sep = ""),
paste(" Deaths per 100k: ", deathsper100k, sep = ""),
sep = "<br>")
) %>%
layout(
title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
yaxis = list(title = "Deaths per 100k"),
xaxis = list(title = "Population Density"),
hovermode = "compare"
)Warning: Ignoring 1 observations
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Step 8
library(dplyr)
# Calculate new_cases and new_deaths
cv_states <- cv_states %>%
arrange(state, date) %>%
group_by(state) %>%
mutate(
new_cases = cases - lag(cases),
new_deaths = deaths - lag(deaths)
) %>%
ungroup()
### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state,
type = "scatter", mode = "lines") %>%
layout(
title = "Naive CFR Over Time for All States",
xaxis = list(title = "Date"),
yaxis = list(title = "Naive CFR (%)")
)Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
filter(state == "Florida") %>%
plot_ly(x = ~date) %>%
add_trace(y = ~new_cases, type = "scatter", mode = "lines", name = "New Cases") %>%
add_trace(y = ~new_deaths, type = "scatter", mode = "lines", name = "New Deaths") %>%
layout(
title = "New Cases and New Deaths Over Time in Florida",
xaxis = list(title = "Date"),
yaxis = list(title = "Count"),
legend = list(title = list(text = "Metric"))
)Step 9
### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)# Calculate new cases per 100,000 people
cv_states <- cv_states %>%
mutate(
newper100k = as.numeric(ifelse(population > 0 & !is.na(population),
round(new_cases / (population / 100000), 1), NA))
)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by="2 weeks")
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
z=~cv_states_mat2,
type="heatmap",
showscale=T)Step 10
### For specified date
pick.date = "2021-10-15"
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
geo = set_map_details
)
fig_pick.date <- fig
#############
### Map for today's date
cv_states <- cv_states %>%
mutate(
newper100k = as.numeric(ifelse(population > 0 & !is.na(population),
round(new_cases / (population / 100000), 1), NA))
)
# Filter for the most recent date to create `cv_states_today`
cv_states_today <- cv_states %>%
filter(date == max(date))
# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL
# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>%
add_trace(
z = ~newper100k, text = ~hover, locations = ~state,
color = ~newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
fig <- fig %>% layout(
title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
geo = set_map_details
)
fig_Today <- fig
### Plot together
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)%>%
layout(title = "Combined Plot of Selected Date and Today's Data")